import pickle
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
%matplotlib inline
# Load the hyper-parameter tuning results saved by the BayesSearchCV run.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(
    yaml_file_name='Run 1 - Logistic Regression - BayesSearchCV.yaml'
)
print(results.best_score)
# Display the best hyper-parameter combination found during the search.
results.best_params
0.7708171336222225
{'model': 'LogisticRegression()',
'C': 0.05676171712944806,
'imputer': "SimpleImputer(strategy='most_frequent')",
'scaler': 'StandardScaler()',
'pca': "PCA('mle')",
'encoder': 'OneHotEncoder()'}
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load the train/test splits produced earlier in the project.
X_train = _load_pickle('../X_train.pkl')
print(X_train.shape)
y_train = _load_pickle('../y_train.pkl')
print(len(y_train))
X_test = _load_pickle('../X_test.pkl')
print(X_test.shape)
y_test = _load_pickle('../y_test.pkl')
print(len(y_test))
(800, 20) 800 (200, 20) 200
# Peek at the first five rows of the training features.
X_train.head()
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29 | NaN | NaN | delayed previously | business | 0.0 | <100 | >=7 | 3.0 | male single | none | 4.0 | no known property | 63.0 | none | own | 2.0 | skilled | 1.0 | yes | yes |
| 535 | >=200 | 21.0 | critical/other existing credit | education | 2319.0 | <100 | <1 | 2.0 | male div/sep | none | 1.0 | car | 33.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 695 | no checking | 6.0 | existing paid | used car | 1236.0 | 500<=X<1000 | 1<=X<4 | 2.0 | male single | none | 4.0 | life insurance | 50.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 557 | no checking | 21.0 | no credits/all paid | new car | 5003.0 | no known savings | 1<=X<4 | 1.0 | female div/dep/mar | none | 4.0 | life insurance | 29.0 | bank | own | 2.0 | skilled | 1.0 | yes | yes |
| 836 | no checking | 12.0 | existing paid | radio/tv | 886.0 | no known savings | 1<=X<4 | 4.0 | female div/dep/mar | none | 2.0 | car | 21.0 | none | own | 1.0 | skilled | 1.0 | none | yes |
# First ten training labels (binary 0/1 array).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Peek at the first five rows of the test features.
X_test.head()
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 521 | <0 | 18.0 | existing paid | radio/tv | 3190.0 | <100 | 1<=X<4 | 2.0 | female div/dep/mar | none | 2.0 | real estate | 24.0 | none | own | 1.0 | skilled | 1.0 | none | yes |
| 737 | <0 | 18.0 | existing paid | new car | 4380.0 | 100<=X<500 | 1<=X<4 | 3.0 | male single | none | 4.0 | car | 35.0 | none | own | 1.0 | unskilled resident | 2.0 | yes | yes |
| 740 | <0 | 24.0 | all paid | new car | 2325.0 | 100<=X<500 | 4<=X<7 | 2.0 | male single | none | 3.0 | car | 32.0 | bank | own | 1.0 | skilled | 1.0 | none | yes |
| 660 | >=200 | 12.0 | existing paid | radio/tv | 1297.0 | <100 | 1<=X<4 | 3.0 | male mar/wid | none | 4.0 | real estate | 23.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 411 | no checking | 33.0 | critical/other existing credit | used car | 7253.0 | <100 | 4<=X<7 | 3.0 | male single | none | 2.0 | car | 35.0 | none | own | 2.0 | high qualif/self emp/mgmt | 1.0 | yes | yes |
# First ten test labels (binary 0/1 array).
y_test[0:10]
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# NOTE(review): exact repeat of the earlier y_train peek — duplicate cell.
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Split feature names into numeric vs non-numeric groups so each group can be
# routed to the appropriate preprocessing sub-pipeline below.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Re-display the winning hyper-parameters; the pipeline below is rebuilt from them.
results.best_params
{'model': 'LogisticRegression()',
'C': 0.05676171712944806,
'imputer': "SimpleImputer(strategy='most_frequent')",
'scaler': 'StandardScaler()',
'pca': "PCA('mle')",
'encoder': 'OneHotEncoder()'}
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Rebuild the winning pipeline from the tuning results: numeric features are
# imputed, scaled, and PCA-reduced; non-numeric features are one-hot encoded;
# the combined matrix feeds a logistic regression with the tuned C value.
# (make_pipeline is used throughout so the auto-generated step names match
# what the rest of this notebook expects in `named_steps`.)
numeric_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
    PCA(n_components='mle'),
)
non_numeric_preprocessing = make_pipeline(OneHotEncoder())
preprocessing = ColumnTransformer([
    ('numeric', numeric_preprocessing, numeric_columns),
    ('non_numeric', non_numeric_preprocessing, non_numeric_columns),
])
model = LogisticRegression(
    C=results.best_params['C'],
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
full_pipeline = make_pipeline(preprocessing, model)
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps
{'columntransformer': ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('standardscaler',
StandardScaler()),
('pca',
PCA(n_components='mle'))]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('onehotencoder',
OneHotEncoder())]),
['checking_status', 'credit_history',
'purpose', 'savings_status', 'employment',
'personal_status', 'other_parties',
'property_magnitude', 'other_payment_plans',
'housing', 'job', 'own_telephone',
'foreign_worker'])]),
'logisticregression': LogisticRegression(C=0.05676171712944806, max_iter=1000, random_state=42)}
# Fit the tuned pipeline on the full training set. (Commented-out scratch
# calls to predict/predict_proba were removed; the test-set scoring happens
# explicitly in the evaluation cells below.)
fitted_pipeline = full_pipeline.fit(X_train, y_train)
Understand the nature and degree of model overfitting by comparing training-set performance to the test-set evaluation below.
# Evaluate on the TRAINING data so these numbers can be compared against the
# test-set evaluation further down to gauge overfitting.
train_scores = fitted_pipeline.predict_proba(X_train)[:, 1]
training_evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_train,
    predicted_scores=train_scores,
    score_threshold=0.5,
)
training_evaluator.plot_predicted_scores_histogram()
training_evaluator.plot_actual_vs_predict_histogram()
# Score the hold-out test set (probability of the positive class).
test_scores = fitted_pipeline.predict_proba(X_test)[:, 1]
# save the predictions so that we can compare across models
with open('test_set_predictions.pkl', 'wb') as handle:
    pickle.dump(test_scores, handle)
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_test,
    predicted_scores=test_scores,
    positive_class='Defaulted',
    negative_class='Not Defaulted',
    score_threshold=0.5,
)
del test_scores
evaluator.plot_predicted_scores_histogram()
evaluator.plot_actual_vs_predict_histogram()
evaluator.plot_confusion_matrix()
# Full metric table, with dummy-classifier baselines for context.
evaluator.all_metrics_df(
    return_style=True,
    dummy_classifier_strategy=['prior', 'constant'],
    round_by=3,
)
| Score | Dummy (prior) | Dummy (constant) | Explanation | |
|---|---|---|---|---|
| AUC | 0.796 | 0.500 | 0.500 | Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier) |
| True Positive Rate | 0.441 | 0.000 | 1.000 | 44.1% of positive instances were correctly identified.; i.e. 26 "Defaulted" labels were correctly identified out of 59 instances; a.k.a Sensitivity/Recall |
| True Negative Rate | 0.908 | 1.000 | 0.000 | 90.8% of negative instances were correctly identified.; i.e. 128 "Not Defaulted" labels were correctly identified out of 141 instances |
| False Positive Rate | 0.092 | 0.000 | 1.000 | 9.2% of negative instances were incorrectly identified as positive; i.e. 13 "Not Defaulted" labels were incorrectly identified as "Defaulted", out of 141 instances |
| False Negative Rate | 0.559 | 1.000 | 0.000 | 55.9% of positive instances were incorrectly identified as negative; i.e. 33 "Defaulted" labels were incorrectly identified as "Not Defaulted", out of 59 instances |
| Positive Predictive Value | 0.667 | 0.000 | 0.295 | When the model claims an instance is positive, it is correct 66.7% of the time; i.e. out of the 39 times the model predicted "Defaulted", it was correct 26 times; a.k.a precision |
| Negative Predictive Value | 0.795 | 0.705 | 0.000 | When the model claims an instance is negative, it is correct 79.5% of the time; i.e. out of the 161 times the model predicted "Not Defaulted", it was correct 128 times |
| F1 Score | 0.531 | 0.000 | 0.456 | The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. |
| Accuracy | 0.770 | 0.705 | 0.295 | 77.0% of instances were correctly identified |
| Error Rate | 0.230 | 0.295 | 0.705 | 23.0% of instances were incorrectly identified |
| % Positive | 0.295 | 0.295 | 0.295 | 29.5% of the data are positive; i.e. out of 200 total observations; 59 are labeled as "Defaulted" |
| Total Observations | 200 | 200 | 200 | There are 200 total observations; i.e. sample size |
# ROC/AUC curve rendered as an interactive plotly figure.
auc_fig = evaluator.plot_auc_curve(return_plotly=True)
auc_fig.show()
<Figure size 720x444.984 with 0 Axes>
# How the classification metrics shift as the score threshold moves across 0.1-0.6.
fig = evaluator.plot_threshold_curves(
    score_threshold_range=(0.1, 0.6),
    return_plotly=True,
)
fig.show()
<Figure size 720x444.984 with 0 Axes>
# Precision/recall trade-off over the same 0.1-0.6 threshold range.
fig = evaluator.plot_precision_recall_tradeoff(
    score_threshold_range=(0.1, 0.6),
    return_plotly=True,
)
fig.show()
<Figure size 720x444.984 with 0 Axes>
# Cumulative gain and lift by predicted-score percentile.
evaluator.calculate_lift_gain(return_style=True)
| Gain | Lift | |
|---|---|---|
| Percentile | ||
| 5 | 0.15 | 3.05 |
| 10 | 0.24 | 2.37 |
| 15 | 0.34 | 2.26 |
| 20 | 0.44 | 2.20 |
| 25 | 0.54 | 2.17 |
| 30 | 0.63 | 2.09 |
| 35 | 0.69 | 1.99 |
| 40 | 0.75 | 1.86 |
| 45 | 0.76 | 1.69 |
| 50 | 0.80 | 1.59 |
| 55 | 0.83 | 1.51 |
| 60 | 0.85 | 1.41 |
| 65 | 0.88 | 1.36 |
| 70 | 0.93 | 1.33 |
| 75 | 0.95 | 1.27 |
| 80 | 0.98 | 1.23 |
| 85 | 0.98 | 1.16 |
| 90 | 0.98 | 1.09 |
| 95 | 0.98 | 1.03 |
| 100 | 1.00 | 1.00 |
# Same gain/lift table, expanded with per-percentile observation and event counts.
evaluator.calculate_lift_gain(return_style=True, include_all_info=True)
| # of Obs. | # of Pos. Events | Cumul. Pos. Events | Gain | Lift | |
|---|---|---|---|---|---|
| Percentile | |||||
| 5 | 10 | 9 | 9 | 0.15 | 3.05 |
| 10 | 10 | 5 | 14 | 0.24 | 2.37 |
| 15 | 10 | 6 | 20 | 0.34 | 2.26 |
| 20 | 10 | 6 | 26 | 0.44 | 2.20 |
| 25 | 10 | 6 | 32 | 0.54 | 2.17 |
| 30 | 10 | 5 | 37 | 0.63 | 2.09 |
| 35 | 10 | 4 | 41 | 0.69 | 1.99 |
| 40 | 10 | 3 | 44 | 0.75 | 1.86 |
| 45 | 10 | 1 | 45 | 0.76 | 1.69 |
| 50 | 10 | 2 | 47 | 0.80 | 1.59 |
| 55 | 10 | 2 | 49 | 0.83 | 1.51 |
| 60 | 10 | 1 | 50 | 0.85 | 1.41 |
| 65 | 10 | 2 | 52 | 0.88 | 1.36 |
| 70 | 10 | 3 | 55 | 0.93 | 1.33 |
| 75 | 10 | 1 | 56 | 0.95 | 1.27 |
| 80 | 10 | 2 | 58 | 0.98 | 1.23 |
| 85 | 10 | 0 | 58 | 0.98 | 1.16 |
| 90 | 10 | 0 | 58 | 0.98 | 1.09 |
| 95 | 10 | 0 | 58 | 0.98 | 1.03 |
| 100 | 10 | 1 | 59 | 1.00 | 1.00 |
from sklearn.inspection import permutation_importance
import time

# Permutation importance on the TRAINING set: shuffle each feature column and
# measure the drop in the pipeline's score, averaged over n_repeats shuffles.
estimator = full_pipeline
# perf_counter() is monotonic and higher-resolution than time.time(), so it is
# the correct clock for measuring elapsed wall time.
start_time = time.perf_counter()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# NOTE(review): the name says "forest" but the model is logistic regression;
# kept as-is because the plotting cell below references this variable.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 1.273 seconds
# matplotlib.pyplot is already imported as `plt` at the top of the file, so the
# redundant re-import that was here has been removed.
# Bar chart of mean importance per feature, with std-dev error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()